package au.com.acpfg.misc.fasta;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.knime.base.node.util.BufferedFileReader;
import org.knime.core.data.DataCell;
import org.knime.core.data.DataColumnSpec;
import org.knime.core.data.DataColumnSpecCreator;
import org.knime.core.data.DataRow;
import org.knime.core.data.DataTableSpec;
import org.knime.core.data.DataType;
import org.knime.core.data.RowKey;
import org.knime.core.data.collection.CollectionCellFactory;
import org.knime.core.data.collection.ListCell;
import org.knime.core.data.def.DefaultRow;
import org.knime.core.data.def.StringCell;
import org.knime.core.node.BufferedDataContainer;
import org.knime.core.node.BufferedDataTable;
import org.knime.core.node.CanceledExecutionException;
import org.knime.core.node.ExecutionContext;
import org.knime.core.node.ExecutionMonitor;
import org.knime.core.node.InvalidSettingsException;
import org.knime.core.node.NodeLogger;
import org.knime.core.node.NodeModel;
import org.knime.core.node.NodeSettingsRO;
import org.knime.core.node.NodeSettingsWO;
import org.knime.core.node.defaultnodesettings.SettingsModelBoolean;
import org.knime.core.node.defaultnodesettings.SettingsModelString;

/**
 * This is the model implementation of FastaReader.
 * This node reads sequences from the user-specified FASTA file (or from every
 * FASTA file in a directory) and outputs, per sequence:
 * <ol>
 *   <li>Accession</li>
 *   <li>Description - often not accurate in practice</li>
 *   <li>Sequence data</li>
 *   <li>Name of the file the sequence was read from</li>
 * </ol>
 * Neither line breaks nor leading/trailing whitespace are preserved in the
 * sequence data. A second output port carries the table produced by
 * {@link SequenceStatistics}.
 *
 * @author Andrew Cassin
 */
public class FastaReaderNodeModel extends NodeModel {

    // the logger instance
    private static final NodeLogger logger = NodeLogger
            .getLogger(FastaReaderNodeModel.class);

    /** the settings key which is used to retrieve and
        store the settings (from the dialog or from a settings file)
       (package visibility to be usable from the dialog). */
    static final String CFGKEY_FASTA         = "fasta-file";
    static final String CFGKEY_ACCSN_RE      = "accsn-regexp";
    static final String CFGKEY_DESCR_RE      = "description-regexp";
    static final String CFGKEY_ENTRY_HANDLER = "entry-handler";
    static final String CFGKEY_FASTADIR      = "fasta-dir";
    static final String CFGKEY_ISDIR         = "read-entire-directory";
    static final String CFGKEY_MAKESTATS     = "make-statistics?";

    /** initial sequence file */
    private static final String DEFAULT_FASTA         = "/tmp/sequences.fasta";
    private static final String DEFAULT_ACCSN_RE      = "^(\\S+)\\b";
    private static final String DEFAULT_DESCR_RE      = "^\\S+\\s*(.*)$";
    private static final String DEFAULT_ENTRY_HANDLER = "single";
    private static final String DEFAULT_FASTADIR      = "c:/temp";
    private static final Boolean DEFAULT_ISDIR        = Boolean.FALSE;
    private static final Boolean DEFAULT_MAKESTATS    = Boolean.FALSE; // dont waste memory and performance by default

    /** File name suffixes (lower case) accepted when reading an entire directory. */
    private static final String[] FASTA_EXTENSIONS = {
        ".fa", ".fasta", ".txt", ".seq",
        ".fa.gz", ".fsa.gz", ".fsa", ".fna", ".fna.gz",
        ".fasta.gz", ".txt.gz", ".seq.gz",
        ".fa.z", ".fasta.z", ".txt.z", ".seq.z"
    };

    // settings for this node: regular expressions to process the ">" lines, and the fasta sequence filename
    private final SettingsModelString m_fasta         = make(CFGKEY_FASTA);
    private final SettingsModelString m_accsn_re      = make(CFGKEY_ACCSN_RE);
    private final SettingsModelString m_descr_re      = make(CFGKEY_DESCR_RE);
    private final SettingsModelString m_entry_handler = make(CFGKEY_ENTRY_HANDLER);
    private final SettingsModelString m_fastadir      = make(CFGKEY_FASTADIR);
    private final SettingsModelBoolean m_isdir        = new SettingsModelBoolean(CFGKEY_ISDIR, DEFAULT_ISDIR);
    private final SettingsModelBoolean m_stats        = new SettingsModelBoolean(CFGKEY_MAKESTATS, DEFAULT_MAKESTATS);

    /**
     * Constructor for the node model.
     */
    protected FastaReaderNodeModel() {
        super(0, 2); // output ports only
    }

    /**
     * Creates the string settings model for one of the configuration keys,
     * initialised with that key's default value.
     *
     * @param k one of the {@code CFGKEY_*} string-valued keys
     * @return the new settings model, or {@code null} if the key is not recognised
     */
    public static SettingsModelString make(String k) {
        if (k.equals(CFGKEY_FASTA)) {
            return new SettingsModelString(k, DEFAULT_FASTA);
        } else if (k.equals(CFGKEY_ACCSN_RE)) {
            return new SettingsModelString(k, DEFAULT_ACCSN_RE);
        } else if (k.equals(CFGKEY_DESCR_RE)) {
            return new SettingsModelString(k, DEFAULT_DESCR_RE);
        } else if (k.equals(CFGKEY_ENTRY_HANDLER)) {
            return new SettingsModelString(k, DEFAULT_ENTRY_HANDLER);
        } else if (k.equals(CFGKEY_FASTADIR)) {
            SettingsModelString sms = new SettingsModelString(k, DEFAULT_FASTADIR);
            sms.setEnabled(false); // since default is for single file load
            return sms;
        }
        return null;
    }

    /**
     * Builds the specification of the sequence output table.
     *
     * @param as_single if true the accession and description columns are plain
     *                  strings, otherwise they are lists of strings
     * @return a four-column spec: Accession, Description, Sequence, Filename
     */
    protected DataTableSpec make_output_spec(boolean as_single) {
        // 1. create the column specification in accordance with the as_single parameter
        DataColumnSpec[] allColSpecs = new DataColumnSpec[4];
        DataType dt = as_single ? StringCell.TYPE : ListCell.getCollectionType(StringCell.TYPE);
        allColSpecs[0] = new DataColumnSpecCreator("Accession", dt).createSpec();
        allColSpecs[1] = new DataColumnSpecCreator("Description", dt).createSpec();
        allColSpecs[2] = new DataColumnSpecCreator("Sequence", StringCell.TYPE).createSpec();
        allColSpecs[3] = new DataColumnSpecCreator("Filename", StringCell.TYPE).createSpec();
        return new DataTableSpec(allColSpecs);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected BufferedDataTable[] execute(final BufferedDataTable[] inData,
            final ExecutionContext exec) throws Exception {
        boolean as_single = m_entry_handler.getStringValue().equals("single");
        DataTableSpec outputSpec = make_output_spec(as_single);
        DataTableSpec statSpec   = SequenceStatistics.getOutputSpec();

        ArrayList<String> filenames = findInputFiles();
        if (filenames.size() < 1) {
            throw new InvalidSettingsException("No files to process!");
        } else {
            logger.info("Found "+filenames.size()+" FASTA files to process.");
        }

        BufferedDataContainer container      = exec.createDataContainer(outputSpec);
        BufferedDataContainer statsContainer = exec.createDataContainer(statSpec);

        long n_seq     = 0;
        long n_seq_rej = 0;
        Pattern accsn_matcher = Pattern.compile(m_accsn_re.getStringValue());
        Pattern descr_matcher = Pattern.compile(m_descr_re.getStringValue());

        // let's add sequences from input file(s) into the output port
        int files_done = 0;
        for (String path : filenames) {
            exec.checkCanceled();
            logger.info("Processing FASTA file: "+path);
            File input_sequences = new File(path);
            SequenceStatistics stats = m_stats.getBooleanValue() ? new SequenceStatistics(input_sequences) : null;

            if (input_sequences.length() < 1) {
                logger.warn("Empty file: "+path+", ignored.");
                files_done++;
                continue;
            }

            // only the bare file name is reported in the output table
            String fname = input_sequences.getName();
            BufferedReader rseq = openReader(input_sequences);
            try {
                // header line carried over from the previous record, if the
                // sequence reader stopped on one
                String header = null;
                while (true) {
                    if (header == null) {
                        header = readNextHeader(rseq);
                        if (header == null) {
                            break; // end of file
                        }
                    }

                    // several entries may share one sequence, separated by Ctrl-A
                    String[] entries = header.split("\\x01");
                    if (entries.length > 0 && entries[0].startsWith(">")) {
                        entries[0] = entries[0].substring(1); // skip over > for parse_accession()
                    }
                    String[] accsn = parse_accession(accsn_matcher, entries);
                    String[] descr = parse_description(descr_matcher, entries);

                    StringBuffer seq = new StringBuffer(10 * 1024);
                    header = readSequence(rseq, seq);

                    // save the sequence to the container; a header which matched
                    // neither regular expression yields a null cell and is rejected
                    DataCell c1 = toCell(accsn, as_single);
                    DataCell c2 = toCell(descr, as_single);
                    if (save_sequence(container, n_seq, c1, c2, seq, fname, stats)) {
                        n_seq++;
                    } else {
                        n_seq_rej++;
                    }

                    if ((n_seq + n_seq_rej) % 1000 == 0) {
                        // check if the execution monitor was canceled
                        exec.checkCanceled();
                        // and update node progress "traffic light"
                        exec.setProgress(((double) files_done) / filenames.size(),
                                "Adding " + n_seq+" from "+fname);
                    }
                }
            } finally {
                rseq.close(); // avoid open file leak, whatever went wrong
            }

            if (stats != null) {
                stats.addStats(statsContainer);
            }
            files_done++;
        }

        // once we are done, we close the container and return its table
        container.close();
        statsContainer.close();
        BufferedDataTable out        = container.getTable();
        BufferedDataTable statsTable = statsContainer.getTable();
        logger.info("Matched "+n_seq+ " sequences, failed to match "+n_seq_rej+" sequences.");
        return new BufferedDataTable[]{out, statsTable};
    }

    /**
     * Determines the files to read: either the single configured file, or every
     * regular file with a recognised FASTA extension in the configured directory.
     *
     * @return absolute paths (directory mode) or the configured path (single file mode)
     * @throws InvalidSettingsException if the configured directory cannot be listed
     */
    private ArrayList<String> findInputFiles() throws InvalidSettingsException {
        ArrayList<String> filenames = new ArrayList<String>();
        if (!m_isdir.getBooleanValue()) {
            filenames.add(m_fasta.getStringValue());
            return filenames;
        }

        File dir = new File(m_fastadir.getStringValue());
        File[] list = dir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File arg0) {
                if (!arg0.isFile()) {
                    return false; // a directory called e.g. "old.fasta" cannot be read
                }
                String fname = arg0.getName().toLowerCase();
                for (String ext : FASTA_EXTENSIONS) {
                    if (fname.endsWith(ext)) {
                        return true;
                    }
                }
                return false;
            }
        });
        // listFiles() returns null (not an empty array) if the path is not a
        // directory or cannot be read
        if (list == null) {
            throw new InvalidSettingsException("Cannot read FASTA directory: " + dir.getAbsolutePath());
        }
        for (File f : list) {
            filenames.add(f.getAbsolutePath());
        }
        return filenames;
    }

    /**
     * Opens a FASTA file for line-wise reading; names ending in
     * {@code .gz} or {@code .z} (any case) are read through a GZIP decoder.
     */
    private static BufferedReader openReader(File input) throws IOException {
        String lower = input.getName().toLowerCase();
        InputStream is = new FileInputStream(input);
        try {
            if (lower.endsWith(".gz") || lower.endsWith(".z")) {
                is = new GZIPInputStream(is, 16*1024);
            }
            return new BufferedReader(new InputStreamReader(is));
        } catch (IOException e) {
            // e.g. not actually in GZIP format: release the underlying file
            is.close();
            throw e;
        }
    }

    /**
     * Skips forward to the next line starting with {@code >}.
     *
     * @return the header line (including the leading {@code >}), or null at end of file
     */
    private static String readNextHeader(BufferedReader rseq) throws IOException {
        String line;
        while ((line = rseq.readLine()) != null) {
            if (line.startsWith(">")) {
                return line;
            }
        }
        return null;
    }

    /**
     * Appends the (trimmed) sequence lines of the current record to {@code seq}.
     * Blank lines are ignored. Reading stops at end of file, at the next header
     * line, or at a non-sequence line encountered before any sequence data.
     *
     * @return the untrimmed header line of the next record if reading stopped
     *         on one, otherwise null (the caller must then search for the next header)
     */
    private static String readSequence(BufferedReader rseq, StringBuffer seq) throws IOException {
        boolean got_seq = false;
        String line;
        while ((line = rseq.readLine()) != null) {
            String tline = line.trim();
            if (tline.length() == 0) {
                continue;
            }
            char first_c = tline.charAt(0);
            if (first_c == '>') {
                return line;
            }
            if (Character.isLetter(first_c) || first_c == '*' || first_c == '-') {
                seq.append(tline);
                got_seq = true;
            } else if (!got_seq) {
                // unrecognised content before any residues: give up on this record
                return null;
            }
            // unrecognised lines after sequence data has started are skipped
        }
        return null;
    }

    /**
     * Converts parsed header values into an output cell.
     *
     * @return a string cell of the first value (single mode), a list cell of all
     *         values (list mode), or null if {@code values} is null
     */
    private DataCell toCell(String[] values, boolean as_single) {
        if (values == null) {
            return null;
        }
        if (as_single) {
            return new StringCell(values[0]);
        }
        return CollectionCellFactory.createListCell(toDataCells(values));
    }

    /**
     * Adds one sequence as a row to the output container and, if statistics are
     * enabled, feeds it to the statistics collector.
     *
     * @param container destination table
     * @param n_seq     running sequence number, used to build the row key
     * @param c1        accession cell, may be null
     * @param c2        description cell, may be null
     * @param seq       sequence data, may be null
     * @param fname     name of the source file
     * @param stats     statistics collector, or null if disabled
     * @return true if the row was added, false if any of c1, c2 or seq was null
     */
    protected boolean save_sequence(BufferedDataContainer container, long n_seq, DataCell c1, DataCell c2, StringBuffer seq, String fname, SequenceStatistics stats) {
        if (c1 != null && c2 != null && seq != null) {
            RowKey key = new RowKey("Seq" + n_seq);
            // the cells of the current row, the types of the cells must match
            // the column spec (see above)
            DataCell[] cells = new DataCell[4];
            cells[0] = c1;
            cells[1] = c2;
            String str = seq.toString();
            cells[2] = new StringCell(str);
            cells[3] = new StringCell(fname);
            DataRow row = new DefaultRow(key, cells);
            container.addRowToTable(row);
            if (stats != null) {
                stats.grokSequence(str);
            }
            return true;
        } else {
            // NB: do not update stats object if bogus parameters...
            return false;
        }
    }

    /**
     * Wraps the leading non-null strings of {@code vec} in string cells.
     *
     * @param vec values, with valid entries first and null marking the end
     * @return cells for all entries before the first null
     */
    protected Collection<StringCell> toDataCells(String[] vec) {
        ArrayList<StringCell> al = new ArrayList<StringCell>();
        for (String s : vec) {
            if (s == null) { // terminate add early if only a few entries valid
                break;
            }
            al.add(new StringCell(s));
        }
        return al;
    }

    /**
     * Extracts the accession from each header entry.
     *
     * @param matcher pattern with exactly one capturing group
     * @param entries header entries (without the leading {@code >})
     * @return array as long as {@code entries}, matches first and null-padded,
     *         or null if no entry matched
     * @throws Exception if the pattern does not have exactly one capturing group
     */
    protected String[] parse_accession(Pattern matcher, String[] entries) throws Exception {
        return extractFirstGroup(matcher, entries,
                "You must use capturing parentheses () to match an accession only once!");
    }

    /**
     * Extracts the description from each header entry.
     *
     * @param matcher pattern with exactly one capturing group
     * @param entries header entries (without the leading {@code >})
     * @return array as long as {@code entries}, matches first and null-padded,
     *         or null if no entry matched
     * @throws Exception if the pattern does not have exactly one capturing group
     */
    protected String[] parse_description(Pattern matcher, String[] entries) throws Exception {
        return extractFirstGroup(matcher, entries,
                "You must use capturing parentheses() to match a sequence description only once!");
    }

    /** Shared implementation of parse_accession() and parse_description(). */
    private static String[] extractFirstGroup(Pattern pattern, String[] entries, String groupError)
            throws InvalidSettingsException {
        int cnt = 0;
        String[] found = new String[entries.length]; // unused slots stay null
        for (String entry : entries) {
            Matcher m = pattern.matcher(entry);
            if (m.find()) {
                if (m.groupCount() != 1) {
                    throw new InvalidSettingsException(groupError);
                }
                String value = m.group(1);
                // an optional group which took no part in the match yields null:
                // treat that entry as unmatched rather than storing a null
                if (value != null) {
                    found[cnt++] = value;
                }
            }
        }
        return (cnt > 0) ? found : null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void reset() {
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected DataTableSpec[] configure(final DataTableSpec[] inSpecs)
            throws InvalidSettingsException {
        boolean as_single = m_entry_handler.getStringValue().equals("single");
        DataTableSpec out  = make_output_spec(as_single);
        DataTableSpec out2 = SequenceStatistics.getOutputSpec();
        return new DataTableSpec[] {out, out2};
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveSettingsTo(final NodeSettingsWO settings) {
        m_fasta.saveSettingsTo(settings);
        m_accsn_re.saveSettingsTo(settings);
        m_descr_re.saveSettingsTo(settings);
        m_entry_handler.saveSettingsTo(settings);
        m_fastadir.saveSettingsTo(settings);
        m_isdir.saveSettingsTo(settings);
        m_stats.saveSettingsTo(settings);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadValidatedSettingsFrom(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_fasta.loadSettingsFrom(settings);
        m_accsn_re.loadSettingsFrom(settings);
        m_descr_re.loadSettingsFrom(settings);
        m_entry_handler.loadSettingsFrom(settings);
        m_fastadir.loadSettingsFrom(settings);
        m_isdir.loadSettingsFrom(settings);
        // settings saved before the statistics option existed lack this key
        if (settings.containsKey(CFGKEY_MAKESTATS)) {
            m_stats.loadSettingsFrom(settings);
        } else {
            m_stats.setBooleanValue(Boolean.FALSE);
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void validateSettings(final NodeSettingsRO settings)
            throws InvalidSettingsException {
        m_fasta.validateSettings(settings);
        m_accsn_re.validateSettings(settings);
        m_descr_re.validateSettings(settings);
        m_entry_handler.validateSettings(settings);
        m_fastadir.validateSettings(settings);
        m_isdir.validateSettings(settings);
        // settings saved before the statistics option existed lack this key
        if (settings.containsKey(CFGKEY_MAKESTATS)) {
            m_stats.validateSettings(settings);
        }
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void saveInternals(final File internDir, final ExecutionMonitor exec)
            throws IOException, CanceledExecutionException {
    }

    /**
     * {@inheritDoc}
     */
    @Override
    protected void loadInternals(final File internDir, final ExecutionMonitor exec)
            throws IOException, CanceledExecutionException {
    }
}